# sigrej replic syntax jan2024.R

#############
#
# figures
#
#############

# personalize by USER

library(ggplot2)

##
# open data compiled from Election Administration and Voting Survey (EAVS) various years
##

# USER: download and locate files
# setwd()
# eavsdata <- read.csv("sigrej_eavsdata_jan2024.csv")
head(eavsdata)


## figure 2

# compile data
pctmail_all <- c(eavsdata$pctabsmail10, eavsdata$pctabsmail12, eavsdata$pctabsmail14, eavsdata$pctabsmail16, eavsdata$pctabsmail18, eavsdata$pctabsmail20, eavsdata$pctabsmail22)
pctrej_all <- c(eavsdata$rejpct10, eavsdata$rejpct12, eavsdata$rejpct14, eavsdata$rejpct16, eavsdata$rejpct18, eavsdata$rejpct20, eavsdata$rejpct22)

pctmail_all <- na.omit(pctmail_all)
pctrej_all <- na.omit(pctrej_all)

# figure 2
peavsdata <- as.data.frame(cbind(pctmail_all, pctrej_all))
ggplot(eavsdata = peavsdata) + 
  geom_point(mapping = aes(x = pctmail_all, y = pctrej_all)) +
  geom_smooth(mapping = aes(x = pctmail_all, y = pctrej_all), se=FALSE, color="black") +
  theme(text = element_text(size = 9)) +
  labs(x="% of counted ballots cast by mail (including mailed absentee), 2010-2022", y='% of mail ballots rejected for "mismatched" signature, 2010-2022',)


## figure 1

peavsdata$year <- numeric(nrow(peavsdata))
peavsdata$year <- c(rep(2010, 22), rep(2012, 23), rep(2014, 26), rep(2016, 26), rep(2018, 26), rep(2020, 26), rep(2022, 24))
length(peavsdata$year)

# and state name lables
n1 <- eavsdata$state[is.na(eavsdata$rejpct10)==F]
n2 <- eavsdata$state[is.na(eavsdata$rejpct12)==F]
n3 <- eavsdata$state[is.na(eavsdata$rejpct22)==F]
sname <- c(n1, n2, rep(eavsdata$state, 4), n3)
length(sname)

peavsdata$sname <- sname

ggplot(peavsdata, aes(pctmail_all, pctrej_all, label=sname)) + 
  facet_wrap(vars(peavsdata$year), nrow=2, ncol=4) +
  geom_text(size=2) +
  theme(axis.title.x = element_text(size = 10),
        axis.title.y = element_text(size = 10)) +
  labs(x="% of counted ballots cast by mail (including mailed absentee), 2010-2022", y='% of mail ballots rejected for "mismatched" signature, 2010-2022')



## 
# PPV illustration
##

# generic formulation of Bayes' theorem: P(A|B) = ( P(B|A)*P(A) ) / P(B)
# PPV is positive predictive value. Here, probability a signature is invalid, given rejected
# to illustrate, PA is a sequence of potential values of the probability of invalid signature, PA
# and PAcom is the complement of PA, so that probabilities sum to 1
# which reflects the possibility that a valid signature could be wrongly rejected

PA <- seq(1/50000, 0.15, length.out=100)
PAcom <- 1-PA

# allow for 99% accuracy in assessing valid, 99% in assessing invalid signatures
# and allow for 95% accuracy in each
# and allow combinations of the two
PB1_a <- 0.99
PB2_a <- 0.01
PB1_b <- 0.95
PB2_b <- 0.01
PB1_c <- 0.99
PB2_c <- 0.05
PB1_d <- 0.95
PB2_d <- 0.05

#  curves based on these values
Y1 <- (PB1_a*PA)/((PB1_a*PA) + (PB2_a*PAcom))
Y2 <- (PB1_b*PA)/((PB1_b*PA) + (PB2_b*PAcom))
Y3 <- (PB1_c*PA)/((PB1_c*PA) + (PB2_c*PAcom))
Y4 <- (PB1_d*PA)/((PB1_d*PA) + (PB2_d*PAcom))

# data in long format for plotting
pmini <- as.data.frame(cbind(rep(PA,4), c(Y1, Y2, Y3, Y4),
                             c(rep("A: 99% if invalid, 99% if valid", 100),
                               rep("B: 95% if invalid, 99% if valid", 100),
                               rep("C: 99% if invalid, 95% if valid", 100),
                               rep("D: 95% if invalid, 95% if valid", 100))))
colnames(pmini) <- c("x", "y", "Accuracy")
pmini$x <- as.numeric(pmini$x) * 100
pmini$y <- as.numeric(pmini$y)
head(pmini); tail(pmini)

## Figure 3 in paper
ggplot(data = pmini) + 
  geom_line(mapping = aes(x = x, y = y, linetype=Accuracy), linewidth=0.4) +
  scale_x_continuous(limits = c(0, 15), expand=c(0,0)) +
  scale_y_continuous(limits = c(0, 1), expand=c(0,0)) +
  scale_linetype_manual(name="Accuracy of signature checks", values=c("dotted", "dashed", "solid", "twodash")) +
  labs(x="Assumed % of invalid signatures", y="Positive predictive value of mail ballot return signature checks") +
  theme(text = element_text(size = 9)) +
  annotate(geom="text", x=13.5, y=0.965, label="A", size=3) +
  annotate(geom="text", x=13.5, y=0.905, label="B", size=3) +
  annotate(geom="text", x=13.5, y=0.785, label="C", size=3) +
  annotate(geom="text", x=13.5, y=0.71, label="D", size=3)


#####################################
#
# examples of Bayes' theorem applied
#
#####################################

# We need 4 quantities (though two of them are related, since D is one minus B)

# A. The probability that a signature will be rejected if it is invalid.
# P(sig.rejected | sig.invalid)

# B. The probability that a signature is invalid
# P(sig. invalid)

# C. The probability that a signature will be rejected if it is valid
# P(sig.rejected | sig.valid)

# D. The probability that a signature is valid
# P(sig.valid) = 1-(P.sig.invalid)

# Bayes’ theorem yields probability that a rejected signature is actually invalid:
# PPV = A*B/(A*B+C*D)
# and number of wrongful rejections, for each rightful rejections, is (1-PPV)/PPV

##
## examples
## 

## first illustration in paper
A1 <- 0.98
B1 <- 0.00425
C1 <- 0.02
D1 <- 1 - B1
PPV1 <- A1*B1/(A1*B1+C1*D1); PPV1
# [1] 0.1729651
PPV1wrong <- (1-PPV1)/PPV1; PPV1wrong
# [1] 4.781513

##
## values for Table 1
##

## just vary assumed prevalence of invalid ballots

# first, assume 99% accuracy in rejecting invalid, 99% accuracy in accepting valid
# and assume worst-case fraud only, 1/4000 = 0.00025
# then assume worst-case fraud plus 1/1000 errors = 0.00125
# then assume worst-case fraud plus 1/500 errors = 0.00225
# then assume worst-case fraud plus 1/250 errors = 0.00425

A <- c(rep(0.99, 4))
B <- c(0.00025, 0.00125, 0.00225, 0.00425)
C <- c(rep(0.01, 4))
D <- 1 - B
PPV <- A*B/(A*B + C*D); round(PPV, 3)
#[1] 0.024 0.110 0.183 0.297
# and the number of wrongful rejections, per rightful
PPVwrong <- (1-PPV)/PPV; round(PPVwrong,3)
#[1] 40.394  8.071  4.479  2.367


## now also vary assumed accuracy

# assume 95% accuracy in rejecting invalid, 99% accuracy in accepting valid

A <- c(rep(0.95, 4))
B <- c(0.00025, 0.00125, 0.00225, 0.00425)
C <- c(rep(0.01, 4))
D <- 1 - B
PPV <- A*B/(A*B + C*D); round(PPV, 3)
#[1] 0.023 0.106 0.176 0.288
# and the number of wrongful rejections, per rightful
PPVwrong <- (1-PPV)/PPV; round(PPVwrong,3)
#[1] 42.095  8.411  4.668  2.466

# next, assume 99% accuracy in rejecting invalid, 95% accuracy in accepting valid

A <- c(rep(0.99, 4))
B <- c(0.00025, 0.00125, 0.00225, 0.00425)
C <- c(rep(0.05, 4))
D <- 1 - B
PPV <- A*B/(A*B + C*D); round(PPV, 3)
#[1] 0.005 0.024 0.043 0.078
# and the number of wrongful rejections, per rightful
PPVwrong <- (1-PPV)/PPV; round(PPVwrong,3)
#[1] 201.970  40.354  22.396  11.833


## appendix example
(0.9 * 0.00025 + 0.999 * 0.004) / (0.9 * 0.00025 + 0.999 * 0.004 + 0.02*0.99575)
# = 0.175 
# decomposition: 
# attempt fraud: (0.9*0.00025)/(0.9*0.00025 + 0.999*0.004 + 0.02*0.99575) = 1% of rejects
# voter error: (0.999*0.004)/(0.9*0.00025 + 0.999*0.004 + 0.02*0.99575) = 17% of rejects
# worker error: (0.02*0.99575)/(0.9*0.00025 + 0.999*0.004 + 0.02*0.99575) = 82% of rejects


############################
#
# Ohio cure rate analysis
#
############################

# Cuyahoga county, 2020 primary election, absentee ballot application rejections, and cures,
# for alleged signature mismatch

# original deadline for absentee ballot for the march 17, application deadline March 14
# 86 rejected applications in first two weeks of march, of which 28 cured

# after postponement on march 16, due to COVID-19, new deadline was april 25 
# 317 rejected applications from April 11 to 25, of which 36 cured

# Bayesian model
library(rstanarm)
set.seed(123)
n1 <- 86  # number of trials for group 1
n2 <- 317  # number of trials for group 2

successes_group1 <- 28
successes_group2 <- 36

oh_data <- data.frame(
            group = c("Group 1", "Group 2"),
            successes = c(successes_group1, successes_group2),
            trials = c(n1, n2)
            )

# Fit Bayesian binomial model; summarize results
oh_model <- stan_glm(cbind(successes, trials - successes) ~ group, data = oh_data, family = binomial(), chains = 4)
summary(oh_model, prob=c(0.005, 0.025, 0.975, 0.995), digits=2)
# the 99% credible interval for the group difference does not cover zero


##############################################################################
#
# Modeling effect of covid-affected primary on sig reject rate in the general
#
##############################################################################

mean(eavsdata$rejpct20[eavsdata$primchange20==0], na.rm=T)
# 0.26
mean(eavsdata$rejpct20[eavsdata$primchange20==1], na.rm=T)
# 0.11
sd(eavsdata$rejpct20, na.rm=T)
#[1] 0.1886552

## compare general election sig reject rates in places with/without covid-affected primary
set.seed(501)
fit1 <- stan_glm(rejpct20 ~ primchange20, data=eavsdata, family=gaussian())
summary(fit1, prob=c(0.005, 0.025, 0.975, 0.995), digits=3)
# rejection rate estimated to be 0.15% lower, nearly one SD
# the 95% credible interval for the group difference does not, but 99% does cover zero

## same, but include for each state a control for it's sig reject rate in 2018 election
set.seed(501)
fit2 <- stan_glm(rejpct20 ~ primchange20  + rejpct18, data=eavsdata, family=gaussian())
summary(fit2, prob=c(0.005, 0.025, 0.975, 0.995), digits=3)
# rejection rate estimated to be 0.16% lower, nearly one SD
# the 95% credible interval for the group difference does not, but 99% does cover zero

## same, but include for each state a control for it's sig reject rate in 2016 election
set.seed(501)
fit3 <- stan_glm(rejpct20 ~ primchange20  + rejpct16, data=eavsdata, family=gaussian())
summary(fit3, prob=c(0.005, 0.025, 0.975, 0.995), digits=3)
# rejection rate estimated to be 0.12% lower, nearly one SD
# the 95% credible interval for the group difference just crosses zero

## alternative research design: use first-differenced outcome variable
# (try both 2018 and 2016 as baselines)
eavsdata$sigrejdiff1820 <- eavsdata$rejpct20 - eavsdata$rejpct18
eavsdata$sigrejdiff1620 <- eavsdata$rejpct20 - eavsdata$rejpct16

set.seed(501)
fit4 <- stan_glm(sigrejdiff1820 ~ primchange20, data=eavsdata, family=gaussian())
summary(fit4, prob=c(0.005, 0.025, 0.975, 0.995), digits=3)
# _change_ in reject rate vs prev election is 0.25% lower, in states with covid-affected primary
# the 95% credible interval for the group difference crosses zero

set.seed(501)
fit5 <- stan_glm(sigrejdiff1620 ~ primchange20, data=eavsdata, family=gaussian())
summary(fit5, prob=c(0.005, 0.025, 0.975, 0.995), digits=3)
# _change_ in reject rate vs prev election is 0.09% lower, in states with covid-affected primary
# the 95% credible interval for the group difference crosses zero


# end of replication syntax